package spimedb.media;
import com.google.common.base.Joiner;
import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.LogFactory;
import org.apache.commons.logging.impl.Jdk14Logger;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.tools.imageio.ImageIOUtil;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.ContentHandlerFactory;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import org.jpedal.jbig2.jai.JBIG2ImageReaderSpi;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.safety.Cleaner;
import org.jsoup.safety.Whitelist;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.helpers.DefaultHandler;
import spimedb.MutableNObject;
import spimedb.NObject;
import spimedb.Plugin;
import spimedb.SpimeDB;
import javax.imageio.spi.IIORegistry;
import java.awt.image.BufferedImage;
import java.io.*;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLDecoder;
import java.util.List;
import java.util.function.BiFunction;
import java.util.logging.Level;
/**
* Detects document and multimedia metadata, and schedules further processing
* <p>
* https://svn.apache.org/repos/asf/tika/trunk/tika-example/src/main/java/org/apache/tika/example/LuceneIndexerExtended.java
* https://svn.apache.org/repos/asf/tika/trunk/tika-example/src/main/java/org/apache/tika/example/SimpleTextExtractor.java
* https://github.com/apache/pdfbox/tree/trunk/examples/src/main/java/org/apache/pdfbox/examples
*/
public class Multimedia implements Plugin, BiFunction<NObject, NObject, NObject> {
/** Logger shared by all instances of this plugin. */
public final static Logger logger = LoggerFactory.getLogger(Multimedia.class);

/** Tika parser that auto-detects the type of the document it is given. */
final Parser tika = new AutoDetectParser();

/** Creates HTML content handlers for Tika; the write limit of -1 means "unlimited" per Tika's documentation. */
final ContentHandlerFactory tikaFactory = new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.HTML, -1);

/** jsoup cleaner restricted to the "basic" whitelist; applied to per-page HTML before text extraction. */
static final Cleaner cleaner = new Cleaner(Whitelist.basic());

/** Quality value handed to the JPEG writer when encoding page thumbnails. */
private final float thumbnailQuality = 0.75f;

/** Resolution (DPI) at which PDF pages are rendered for thumbnails. */
static final int pdfPageImageDPI = 32;

static {
    // Restrict these two PDFBox loggers to SEVERE. This is only possible when commons-logging is
    // backed by java.util.logging; with any other backend the cast would throw ClassCastException
    // during class initialisation and make the whole plugin unusable, so check the type first.
    for (String s : new String[]{"org.apache.pdfbox.rendering.CIDType0Glyph2D", "org.apache.pdfbox.pdmodel.font.PDTrueTypeFont"}) {
        Object log = LogFactory.getLog(s);
        if (log instanceof Jdk14Logger) {
            ((Jdk14Logger) log).getLogger().setLevel(Level.SEVERE);
        }
    }

    // Make the JBIG2 image reader available through ImageIO
    // (presumably needed for JBIG2-compressed images embedded in PDFs -- confirm).
    IIORegistry.getDefaultInstance().registerServiceProvider(new JBIG2ImageReaderSpi());
}

/** Database this plugin is attached to. */
private final SpimeDB db;
public Multimedia(SpimeDB db) {
this.db = db;
db.on(this);
//process existing items
db.forEach((x) -> {
db.runLater(0.5f, ()-> {
NObject y = apply(x, x);
if (y != x) {
db.add(y);
}
});
});
}
/**
 * Loads the resource named by {@code x}'s {@code "url_in"} field, copies the metadata Tika extracts
 * from it onto a new object, and, for PDFs that carry page content, adds one child object per page
 * (page text plus a JPEG thumbnail) to the database.
 * <p>
 * URLs ending in {@code .kml}/{@code .kmz} or {@code .geojson} are not parsed by Tika; they are handed
 * to {@code KML} and {@code GeoJSON} respectively.
 * <p>
 * Any exception raised while loading or parsing is logged and processing continues with the
 * unmodified {@code x}.
 *
 * @param p previously stored version of the object, or {@code null}; when its {@code "url_cached"}
 *          timestamp is not older than the resource's expiration / last-modified time it is
 *          returned unchanged
 * @param x incoming object
 * @return {@code x} itself when it has no {@code "url_in"}; {@code p} when the cached copy is still
 *         valid; otherwise a new {@code MutableNObject} named after the id with its description cleared
 */
@Override
public NObject apply(NObject p, NObject x) {
    final String url = x.get("url_in");
    String xid = x.id();
    if (url == null) {
        return x;
    }

    InputStream stream = null;
    try {
        long exp;
        long fileSize;
        if (url.startsWith("file:")) {
            File f = new File(url.substring(5));
            exp = f.lastModified();
            stream = new FileInputStream(f);
            fileSize = f.length();
        } else {
            URLConnection con = new URL(url).openConnection();
            exp = con.getExpiration();
            if (exp == 0) {
                exp = con.getLastModified();
            }
            fileSize = con.getContentLengthLong(); // -1 when the server does not announce a length
            stream = con.getInputStream();
        }
        if (stream == null) {
            throw new FileNotFoundException();
        }

        //TODO use a separate url_cached for each instance of a sibling class like Multimedia that does only one processing
        //this way they can be enabled/disabled separately without interfering with each other
        //TODO store a hashcode of the data as well as the time for additional integrity
        if (p != null) {
            String whenCached = p.get("url_cached");
            if (whenCached != null && Long.parseLong(whenCached) >= exp) {
                logger.debug("cached: {}", url);
                return p; //still valid; the stream is released by the finally block
            }
        }

        logger.info("load: {}", url);

        GeoNObject y = new GeoNObject(x);
        y.put("url_cached", Long.toString(exp));

        boolean isKMLorKMZ = url.endsWith(".kml") || url.endsWith(".kmz");
        boolean isGeoJSON = url.endsWith(".geojson");

        if (!isKMLorKMZ && !isGeoJSON /* handled separately below */) {
            Metadata metadata = new Metadata();
            ParseContext context = new ParseContext();
            final RecursiveParserWrapper tikaWrapper = new RecursiveParserWrapper(tika, tikaFactory);

            if (stream instanceof FileInputStream) {
                // local file: reference it instead of copying its bytes
                y.put("data", url);
            } else {
                //buffer the bytes for saving
                // An unknown (-1) or over-sized content length cannot be used as an array size;
                // in that case read until end of stream instead.
                byte[] bytes = (fileSize >= 0 && fileSize <= Integer.MAX_VALUE)
                        ? IOUtils.readFully(stream, (int) fileSize)
                        : IOUtils.toByteArray(stream);
                stream.close(); // release the connection before swapping in the in-memory copy
                stream = new ByteArrayInputStream(bytes);
                y.put("data", bytes);
            }

            tikaWrapper.parse(stream, new DefaultHandler(), metadata, context);

            for (Metadata md : tikaWrapper.getMetadata()) {
                for (String k : md.names()) {
                    String kk = tikiToField(k);
                    if (kk == null) {
                        continue; // key is intentionally dropped
                    }
                    String[] v = md.getValues(k);
                    Object vv = v.length > 1 ? v : v[0];
                    if (vv instanceof String) {
                        try {
                            vv = Integer.parseInt((String) vv); // store integral values as numbers
                        } catch (NumberFormatException notAnInt) {
                            // keep the string form
                        }
                    }
                    y.put(kk, vv);
                }
            }
        }

        //HACK run these after the updated 'y' is submitted in case these want to modify it when they run
        if (isKMLorKMZ) {
            new KML(db, y).url(url).run();
        } else if (isGeoJSON) {
            GeoJSON.load(url, GeoJSON.baseGeoJSONBuilder, db);
        }

        x = y;

    } catch (Exception e) {
        // pass the url for the placeholder; the trailing throwable is logged with its stack trace
        logger.error("url_in removal: {}", url, e);
    } finally {
        if (stream != null) {
            try {
                stream.close();
            } catch (IOException ignored) {
                // nothing useful can be done if closing fails
            }
        }
    }

    Object mime = x.get(NObject.TYPE);

    if (mime != null && (mime.equals("image/jpeg") || mime.equals("image/png") /* ... */)) {
        x = new MutableNObject(x)
                .name(titleify(xid))
                .put(NObject.DESC, null)
                .put("thumbnail", "data" /* redirect to the data field which already has the byte[] image */)
        ;
    }

    if ("application/pdf".equals(mime) && x.has("pageCount") && x.has(NObject.DESC) /* leaf */) {
        int pageCount = x.get("pageCount");
        String parentContent = x.get(NObject.DESC);
        String author = x.get("author");

        Document parentDOM = Jsoup.parse(parentContent);
        Elements pagesHTML = parentDOM.select(".page");

        // the title is the same for every page, so resolve it once
        String docTitle = parentDOM.title();
        if (docTitle == null || docTitle.isEmpty()) {
            docTitle = titleify(xid);
        }

        // both the raw stream and the parsed document are closed when the block exits
        try (InputStream is = url.startsWith("file:") ? fileStream(url) : new URL(url).openStream();
             PDDocument document = PDDocument.load(is)) {

            PDFRenderer renderer = new PDFRenderer(document);

            for (int pageIndex = 0; pageIndex < pageCount; pageIndex++) {
                final int page = pageIndex + 1; // 1-based number used in ids and labels

                logger.info("paginate: {} {}", xid, page);

                // The extracted HTML may contain fewer ".page" elements than the reported page
                // count; such pages simply get no text instead of failing the whole document.
                String text = null;
                if (pageIndex < pagesHTML.size()) {
                    Document pd = Document.createShell("");
                    pd.body().appendChild(pagesHTML.get(pageIndex).removeAttr("class"));
                    Elements cc = cleaner.clean(pd).body().children();
                    String[] pdb = cc.stream()
                            .filter(xx -> !xx.children().isEmpty() || xx.hasText())
                            .map(xx -> xx.tagName().equals("p") ? xx.text() : xx) //just use <p> contents
                            .map(Object::toString).toArray(String[]::new);
                    if (pdb.length > 0) {
                        text = Joiner.on('\n').join(pdb);
                    }
                }

                BufferedImage img = renderer.renderImageWithDPI(pageIndex, (float) pdfPageImageDPI, ImageType.RGB);
                ByteArrayOutputStream os = new ByteArrayOutputStream(img.getWidth() * img.getHeight() * 3 /* estimate */);
                if (!ImageIOUtil.writeImage(img, "jpg", os, pdfPageImageDPI, thumbnailQuality)) {
                    logger.warn("thumbnail not written: {} {}", xid, page);
                }
                byte[] thumbnail = os.toByteArray();

                db.add(
                        new MutableNObject(xid + "/" + page)
                                // 'page' is already 1-based, so the total is pageCount itself
                                .name(docTitle + " - (" + page + " of " + pageCount + ")")
                                .withTags(xid)
                                .put("author", author)
                                .put("url", url)
                                .put(NObject.TYPE, "application/pdf")
                                //HACK browser loads the specific page when using the '#' anchor
                                .put("data", xid + "#page=" + page)
                                .put("page", page)
                                .put(NObject.DESC, text)
                                .put("thumbnail", thumbnail)
                );
            }

        } catch (IOException f) {
            logger.error("error: {}", xid, f);
        }
    }

    //clean and update parent DOM
    x = new MutableNObject(x)
            .name(titleify(xid))
            .put(NObject.DESC, null)
    ;

    return x;
}
/**
 * Opens the local file behind a {@code file:} URL.
 *
 * @param url URL string; everything after its first five characters (the {@code "file:"} prefix) is used as the path
 * @return an open stream on that path
 * @throws FileNotFoundException if the path cannot be opened for reading
 */
@NotNull
private static FileInputStream fileStream(String url) throws FileNotFoundException {
    final String path = url.substring("file:".length());
    return new FileInputStream(path);
}
/**
 * Turns an object id into a human-readable title: URL-decodes it, replaces underscores with
 * spaces and trims surrounding whitespace.
 * <p>
 * Decoding always uses UTF-8 rather than the platform default charset. Ids that are not valid
 * URL-encoded text (for example a stray {@code %}) are used as-is instead of raising an exception.
 * Package-private, like {@code tikiToField}, so it can be exercised directly by tests.
 *
 * @param id object id, possibly URL-encoded; must not be {@code null}
 * @return the display title derived from {@code id}
 */
static String titleify(String id) {
    String decoded;
    try {
        decoded = URLDecoder.decode(id, "UTF-8");
    } catch (UnsupportedEncodingException | IllegalArgumentException e) {
        // UTF-8 is always available; IllegalArgumentException signals a malformed %-escape
        decoded = id;
    }
    return decoded.replace("_", " ").trim();
}
/**
 * Translates a Tika metadata key into the field name under which its value is stored.
 *
 * @param k metadata key as reported by Tika
 * @return the target field name, {@code null} when the key is to be dropped, or {@code k} itself
 *         for keys that have no explicit mapping
 */
@Nullable
static String tikiToField(String k) {
    switch (k) {
        // keys that are dropped (several of them duplicate other keys)
        case "dc:title":
        case "Last-Modified":
        case "pdf:docinfo:created":
        case "Creation-Date":
        case "created":
        case "creator":
        case "meta:author":
        case "meta:creation-date":
        case "pdf:PDFVersion":
        case "access_permission:can_modify":
        case "access_permission:extract_for_accessibility":
        case "access_permission:assemble_document":
        case "access_permission:extract_content":
        case "access_permission:fill_in_form":
        case "pdf:docinfo:producer":
        case "modified":
        case "Last-Save-Date":
        case "pdf:docinfo:modified":
        case "meta:save-date":
        case "meta:keyword":
        case "cp:subject":
        case "dc:creator":
        case "dc:description":
        case "dc:subject":
        case "pdf:docinfo:creator":
        case "pdf:docinfo:subject":
        case "X-Parsed-By":
        case "pdf:encrypted":
        case "access_permission:modify_annotations":
        case "access_permission:can_print_degraded":
        case "access_permission:can_print":
        case "pdf:docinfo:keywords":
        case "pdf:docinfo:title": // duplicates "title"
            return null;

        // keys that are renamed
        case "producer":
            return "generator";
        case "Keywords":
            return "keywords";
        case "title":
            return "N";
        case "X-TIKA:content":
            return NObject.DESC;
        case "Author":
            return "author";
        case "Content-Type":
            return NObject.TYPE;
        case "xmpTPg:NPages":
            return "pageCount";

        //TODO other duplicates

        // everything else keeps its original key
        default:
            return k;
    }
}
// private static JsonNode html2json(Element e) {
//
// ObjectNode n = JSON.json.createObjectNode();
// boolean hasChildren = e.children().isEmpty();
// if (hasChildren)
// n.set(e.tagName(), JSON.json.valueToTree(e.children().stream().map(Multimedia::html2json).toArray(x -> new JsonNode[x])));
// if (e.hasText()) {
// n.set(hasChildren ? "_" : e.tagName(), JSON.json.valueToTree(e.textNodes().stream().map(x -> x.text()).toArray(x -> new String[x])));
// }
// return n;
// }
}